
# Replication Code: Cluster Analysis ANES Data

# packages
library(tidyverse)  # tidyverse
library(readxl)     # excel commands
library(haven)      # dta compatibility
library(cluster)    # cluster analysis
library(factoextra) # factor analysis
library(hopkins)    # clustering tendencies (clustertend) 
library(vtable)     # summary stats
library(stats)      # clusterin
library(extrafont)  # fontface for graphs


### Set-Up

# clean workspace and move to the project directory
  # Removes every object from the global environment (loaded packages stay).
  rm(list = ls())
  # The project path is specific to one machine. Switch to it only when it
  # exists; otherwise keep the current working directory, so the relative
  # paths used below ("raw/ANES/...", "graphs/...") can still resolve when the
  # script is started from the project folder on another computer.
  project_dir <- "C:/Users/timba/OneDrive - Universität Bayreuth (1)/Uni/Research General/Protest Network Project/empirics"
  if (dir.exists(project_dir)) {
    setwd(project_dir)
  } else {
    message("Project directory not found; keeping working directory: ", getwd())
  }
  
  
#######################################################
### 2020
#######################################################

# read the 2020 ANES file and rename V200001 to `id`
# (`id` is the key used for the merge further down)
  anes_2020 <- read_dta("raw/ANES/anes_timeseries_2020.dta") %>%
    rename(id = V200001)


### CLARA Algorithm

# build the clustering input in one pipeline:
#   1. pick the id and the thermometer items, naming the seven used items
#   2. keep respondents whose seven items all lie in 0..100
#      (anything else is treated as missing)
#   3. standardize each of the seven items
  poldata <- anes_2020 %>%
    select(
      id,
      V202163,
      laborunions = V202162,
      conservatives = V202164,
      liberals = V202161,
      feminists = V202160,
      chrisfunda = V202159,
      police = V202171,
      blm = V202174
    ) %>%
    filter(if_all(!c(id, V202163), ~ .x %in% seq(0, 100))) %>%
    mutate(across(!c(id, V202163), ~ as.numeric(scale(.x))))

# keep the ids of the retained respondents (same row order as the cluster input)
  political_cluster_ids <- select(poldata, id)

# the cluster input holds only the seven standardized items
  poldata <- select(poldata, !c(id, V202163))

# loop for flexibility regarding number of clusters
  # number clusters
  c <- 3
  # political cluster
  # Column 1 holds the row index; the assignments for k = i go to column i - 1.
  # The matrix is therefore sized from `c` (at least two columns, as before),
  # so that raising `c` above 3 no longer writes past the last column.
  # After cbind() with the id column further down, column `c` of the combined
  # data is the clustering for k = c.
  cluster_coll <- matrix(NA, nrow = nrow(poldata), ncol = max(2, c - 1))
  cluster_coll[, 1] <- seq_len(nrow(poldata))
  for (i in c:c) {
    # CLARA with k = i clusters, 50 samples, PAM-like swap phase
    clara.res <- clara(poldata, k = i, samples = 50, pamLike = TRUE)

    # store the cluster label of every respondent
    cluster_coll[, i - 1] <- clara.res$clustering
  }


# cluster illustrations based on the last CLARA fit
  # default view of the clustering
  fviz_cluster(
    clara.res,
    geom = "point",
    pointsize = 1,
    ellipse.type = "t",  # concentration ellipse
    palette = "Paired",  # color palette
    ggtheme = theme_classic()
  )

  # same plot restricted to the conservatives and liberals thermometers
  fviz_cluster(
    clara.res,
    choose.vars = c("conservatives", "liberals"),
    geom = "point",
    pointsize = 1,
    ellipse.type = "t",  # concentration ellipse
    palette = "Paired",  # color palette
    ggtheme = theme_classic()
  )

# put the respondent ids next to the cluster matrix
  poldata_complete <- cbind(political_cluster_ids, cluster_coll)

# original (unstandardized) thermometer scores, renamed while selecting
  poldata_cop <- anes_2020 %>%
    select(
      id,
      bigbusiness = V202163,
      laborunions = V202162,
      capitalists = V202180,
      socialists = V202179,
      conservatives = V202164,
      liberals = V202161,
      homosexuals = V202166,
      feminists = V202160,
      transgender = V202172,
      metoo = V202183,
      chrisfunda = V202159,
      police = V202171,
      scientists = V202173,
      blm = V202174,
      journalists = V202175,
      nato = V202176,
      UN = V202177,
      who = V202186
    )

# add the original scores to every clustered respondent
  poldata_complete <- left_join(poldata_complete, poldata_cop, by = "id")

# collapse according to cluster
  # Only the seven clustering items were restricted to 0..100 before the
  # clustering step. The other thermometers merged in from the raw file
  # (bigbusiness, homosexuals, transgender, scientists, journalists) were not,
  # so they may still hold values outside 0..100. Every mean is therefore taken
  # over in-range scores only, the same validity rule as the earlier filter.
  # For the seven filtered items this leaves the result unchanged.
  poldata_complete$count <- 1
  poldata_complete_coll <- poldata_complete %>%
    # column `c` of poldata_complete holds the cluster assignment for k = c
    group_by(cluster = poldata_complete[[c]]) %>%
    summarize(
      across(
        c(
          bigbusiness, laborunions, conservatives, liberals, homosexuals,
          feminists, transgender, chrisfunda, police, scientists, blm,
          journalists
        ),
        function(score) {
          score <- as.numeric(score)
          mean(score[score %in% seq(0, 100)])
        }
      ),
      # number of respondents per cluster
      count = sum(count)
    )

# long format: one row per cluster and summary column; keep only the items
# that entered the clustering and fix their order for plotting
  poldata_long <- poldata_complete_coll %>%
    pivot_longer(cols = !cluster, names_to = "group", values_to = "score") %>%
    filter(group %in% names(poldata)) %>%
    mutate(
      group = factor(
        group,
        levels = c("conservatives", "police", "chrisfunda", "laborunions", "blm", "feminists", "liberals"),
        ordered = TRUE
      )
    )

# bar graph
  # cluster sizes
  select(poldata_complete_coll, cluster, count)

  # dodged bars: one bar per thermometer item within each cluster
  plot <- ggplot(filter(poldata_long, group != "count")) +
    geom_bar(
      aes(x = cluster, y = score, fill = group),
      stat = "identity",
      position = position_dodge(),
      color = "black"
    ) +
    labs(
      x = "Cluster Number",
      y = "Average Thermometer Score",
      fill = "Theromometer Questions"
    ) +
    scale_fill_manual(
      name = "Average Thermometer Score",
      values = c("grey30", "grey40", "grey50", "grey60", "grey70", "grey80", "grey90"),
      breaks = c("conservatives", "police", "chrisfunda", "laborunions", "blm", "feminists", "liberals"),
      labels = c("Conservatives", "Police", "Christian Fundamentalists", "Labor Unions", "Black Lives Matter", "Feminists", "Liberals")
    ) +
    theme_minimal() +
    theme(
      text = element_text(family = "Times New Roman", size = 12),
      axis.title = element_text(size = 14),
      legend.title = element_text(size = 14)
    )
  plot

# export to pdf
ggsave(
  "graphs/Cluster Illustration Bar Plot_2020.pdf",
  plot,
  device = cairo_pdf,
  width = 10,
  height = 3
)


  
  
  
  
#######################################################
### 2016
#######################################################

### Set-Up

# read the 2016 ANES file and rename V160001 to `id`
# (`id` is the key used for the merge further down)
anes_2016 <- read_dta("raw/ANES/anes_timeseries_2016.dta") %>%
  rename(id = V160001)


### CLARA Algorithm

# build the clustering input in one pipeline:
#   1. pick the id and the thermometer items, naming the seven used items
#   2. keep respondents whose seven items all lie in 0..100
#      (anything else is treated as missing)
#   3. standardize each of the seven items
poldata <- anes_2016 %>%
  select(
    id,
    V162100,
    laborunions = V162098,
    conservatives = V162101,
    liberals = V162097,
    feminists = V162096,
    chrisfunda = V162095,
    police = V162110,
    blm = V162113
  ) %>%
  filter(if_all(!c(id, V162100), ~ .x %in% seq(0, 100))) %>%
  mutate(across(!c(id, V162100), ~ as.numeric(scale(.x))))

# keep the ids of the retained respondents (same row order as the cluster input)
political_cluster_ids <- select(poldata, id)

# the cluster input holds only the seven standardized items
poldata <- select(poldata, !c(id, V162100))

# loop for flexibility regarding number of clusters
# number clusters
c <- 3
# political cluster
# Column 1 holds the row index; the assignments for k = i go to column i - 1.
# The matrix is therefore sized from `c` (at least two columns, as before),
# so that raising `c` above 3 no longer writes past the last column.
# Note that the relabelling step further down addresses the assignment column
# by its name `2`, which corresponds to c = 3.
cluster_coll <- matrix(NA, nrow = nrow(poldata), ncol = max(2, c - 1))
cluster_coll[, 1] <- seq_len(nrow(poldata))
for (i in c:c) {
  # CLARA with k = i clusters, 50 samples, PAM-like swap phase
  clara.res <- clara(poldata, k = i, samples = 50, pamLike = TRUE)

  # store the cluster label of every respondent
  cluster_coll[, i - 1] <- clara.res$clustering
}


# cluster illustrations based on the last CLARA fit
# default view of the clustering
fviz_cluster(
  clara.res,
  geom = "point",
  pointsize = 1,
  ellipse.type = "t",  # concentration ellipse
  palette = "Paired",  # color palette
  ggtheme = theme_classic()
)

# same plot restricted to the conservatives and liberals thermometers
fviz_cluster(
  clara.res,
  choose.vars = c("conservatives", "liberals"),
  geom = "point",
  pointsize = 1,
  ellipse.type = "t",  # concentration ellipse
  palette = "Paired",  # color palette
  ggtheme = theme_classic()
)

# put the respondent ids next to the cluster matrix
poldata_complete <- cbind(political_cluster_ids, cluster_coll)

# original (unstandardized) thermometer scores, renamed while selecting
poldata_cop <- anes_2016 %>%
  select(
    id,
    V162100,
    laborunions = V162098,
    conservatives = V162101,
    liberals = V162097,
    feminists = V162096,
    chrisfunda = V162095,
    police = V162110,
    blm = V162113
  )

# merge, then renumber the clusters stored in column `2`
# target coding: right = 1, moderates = 2, left = 3
# raw label 1 (moderates) -> 2, raw label 2 (right-wing) -> 1,
# any other label (left-wing) -> 3
poldata_complete <- poldata_complete %>%
  left_join(poldata_cop, by = "id") %>%
  mutate(`2` = ifelse(`2` == 1, 2, ifelse(`2` == 2, 1, 3)))
# poldata_complete_export = select(poldata_complete, id, `2`)
# poldata_complete_export = rename(poldata_complete_export, "cluster" = `2`)
# write.csv(poldata_complete_export, file ="weighting matrices/political_clusters_2016.csv")

# average original score per cluster, plus the number of respondents in it
poldata_complete$count <- 1
poldata_complete_coll <- poldata_complete %>%
  # column `c` of poldata_complete holds the (renumbered) cluster assignment
  group_by(cluster = poldata_complete[[c]]) %>%
  summarize(
    across(
      c(laborunions, conservatives, liberals, feminists, chrisfunda, police, blm),
      mean
    ),
    count = sum(count)
  )

# long format: one row per cluster and summary column; keep only the items
# that entered the clustering and fix their order for plotting
poldata_long <- poldata_complete_coll %>%
  pivot_longer(cols = !cluster, names_to = "group", values_to = "score") %>%
  filter(group %in% names(poldata)) %>%
  mutate(
    group = factor(
      group,
      levels = c("conservatives", "police", "chrisfunda", "laborunions", "blm", "feminists", "liberals"),
      ordered = TRUE
    )
  )

# bar graph
# cluster sizes
select(poldata_complete_coll, cluster, count)

# dodged bars: one bar per thermometer item within each cluster
plot <- ggplot(filter(poldata_long, group != "count")) +
  geom_bar(
    aes(x = cluster, y = score, fill = group),
    stat = "identity",
    position = position_dodge(),
    color = "black"
  ) +
  labs(
    x = "Cluster Number",
    y = "Average Thermometer Score",
    fill = "Theromometer Questions"
  ) +
  scale_fill_manual(
    name = "Average Thermometer Score",
    values = c("grey30", "grey40", "grey50", "grey60", "grey70", "grey80", "grey90"),
    breaks = c("conservatives", "police", "chrisfunda", "laborunions", "blm", "feminists", "liberals"),
    labels = c("Conservatives", "Police", "Christian Fundamentalists", "Labor Unions", "Black Lives Matter", "Feminists", "Liberals")
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Times New Roman", size = 12),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14)
  )
plot

# export to pdf
ggsave(
  "graphs/Cluster Illustration Bar Plot_2016.pdf",
  plot,
  device = cairo_pdf,
  width = 10,
  height = 3
)

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  